{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from sklearn.model_selection import train_test_split\n",
    "import numpy as np\n",
    "from collections import Counter\n",
    "import tensorflow as tf\n",
    "\n",
    "import os\n",
    "import pickle\n",
    "import re\n",
    "from tensorflow.python.ops import math_ops"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "\n",
    "# 原始数据文档\n",
    "* https://tianchi.aliyun.com/datalab/dataSet.html?dataId=408\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 样本骨架 数据结构\n",
    "<img src=\"assets/sample_skeleton.jpg\"/>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Common Feature 数据结构\n",
    "<img src=\"assets/common_feature.jpg\"/>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Common Feature表处理（随机抽取2.5%的UserID和对应的MD5）\n",
    "MD5的计算逻辑是MD5(userID+commonfeature+page)，同一个用户一天浏览多个page，所以，有多个MD5\n",
    "### Training Set 所有的UID汇总"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#-*- coding:utf-8 -*-\n",
    "# Collect every user id (feature field '101') and the MD5 key of its row from the\n",
    "# training common-feature file, then pickle both sets for later cells.\n",
    "train_total_uid_set = set()\n",
    "train_total_md5_set = set()\n",
    "index = 0\n",
    "# No output file is opened in this cell, so only the input needs closing;\n",
    "# `with` does that even if a malformed row raises part-way through.\n",
    "with open(\"xxxxx/sample_train/common_features_train.csv\",'r') as f:\n",
    "    for line in f:\n",
    "        tokens = line.strip().split(\",\")\n",
    "        md5 = tokens[0]\n",
    "        # tokens[2] is a \\001-separated list of entries shaped\n",
    "        # field_id \\002 feature_id \\003 value.\n",
    "        for fea_kv in tokens[2].split('\\001'):\n",
    "            fea_parts = fea_kv.split('\\002')\n",
    "            if fea_parts[0] == '101':\n",
    "                train_total_uid_set.add(fea_parts[1].split('\\003')[0])\n",
    "                train_total_md5_set.add(md5)\n",
    "                # Only the first '101' entry of a row is used.\n",
    "                break\n",
    "        index += 1\n",
    "        if index % 10000 == 0:\n",
    "            print(\"current_index:\",index)\n",
    "\n",
    "# Close the pickle files deterministically so the data is flushed to disk.\n",
    "with open('./ctr_cvr_data/train_total_uid_set.p', 'wb') as uid_out:\n",
    "    pickle.dump(train_total_uid_set, uid_out)\n",
    "with open('./ctr_cvr_data/train_total_md5_set.p', 'wb') as md5_out:\n",
    "    pickle.dump(train_total_md5_set, md5_out)\n",
    "print(0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Reload the pickled uid / MD5 sets written by the previous cell.\n",
    "# NOTE: pickle.load can execute arbitrary code; only load files this notebook produced.\n",
    "with open('./ctr_cvr_data/train_total_uid_set.p', mode='rb') as uid_in:\n",
    "    train_total_uid_set = pickle.load(uid_in)\n",
    "with open('./ctr_cvr_data/train_total_md5_set.p', mode='rb') as md5_in:\n",
    "    train_total_md5_set = pickle.load(md5_in)\n",
    "print(len(train_total_uid_set))\n",
    "print(len(train_total_md5_set))\n",
    "# Count the uids whose summed character codes are divisible by 50.\n",
    "# NOTE(review): the sampling cells below use modulo 40 (2.5%); confirm 50 is intended here.\n",
    "count = 0\n",
    "for k in train_total_uid_set:\n",
    "    value = sum(ord(a) for a in k)\n",
    "    if value % 50 == 0:\n",
    "        count += 1\n",
    "print(count)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 训练集Common Feature 2.5%采样"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#-*- coding:utf-8 -*-\n",
    "# Copy the training common-feature rows whose user id (feature field '101') is in the\n",
    "# sample to a smaller CSV, and record the sampled uids and MD5 keys.\n",
    "train_uid_set = set()\n",
    "train_md5_set = set()\n",
    "index = 0\n",
    "# `with` closes both files even if a malformed row raises part-way through.\n",
    "with open(\"xxxxx/sample_train/common_features_train.csv\",'r') as f:\n",
    "    with open(\"./ctr_cvr_data/common_features_skeleton_train_sample_2_percent.csv\",'w') as f_o:\n",
    "        for line in f:\n",
    "            tokens = line.strip().split(\",\")\n",
    "            md5 = tokens[0]\n",
    "            # tokens[2] is a \\001-separated list of entries shaped\n",
    "            # field_id \\002 feature_id \\003 value.\n",
    "            for fea_kv in tokens[2].split('\\001'):\n",
    "                fea_parts = fea_kv.split('\\002')\n",
    "                if fea_parts[0] == '101':\n",
    "                    fea_id = fea_parts[1].split('\\003')[0]\n",
    "                    # Sum the character codes of the uid to get a deterministic number.\n",
    "                    value = sum(ord(a) for a in fea_id)\n",
    "                    # Keeping sums divisible by 40 is a nominal 1-in-40 (2.5%) sample.\n",
    "                    if value % 40 == 0:\n",
    "                        f_o.write(line)\n",
    "                        train_uid_set.add(fea_id)\n",
    "                        train_md5_set.add(md5)\n",
    "                    # Only the first '101' entry of a row is used.\n",
    "                    break\n",
    "            index += 1\n",
    "            if index % 10000 == 0:\n",
    "                print(\"current_index:\",index)\n",
    "\n",
    "# Close the pickle files deterministically so the data is flushed to disk.\n",
    "with open('./ctr_cvr_data/train_uid_set.p', 'wb') as uid_out:\n",
    "    pickle.dump(train_uid_set, uid_out)\n",
    "with open('./ctr_cvr_data/train_md5_set.p', 'wb') as md5_out:\n",
    "    pickle.dump(train_md5_set, md5_out)\n",
    "print(0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Reload the sampled training uid / MD5 sets; `with` closes the file handles promptly.\n",
    "# NOTE: pickle.load can execute arbitrary code; only load files this notebook produced.\n",
    "with open('./ctr_cvr_data/train_uid_set.p', mode='rb') as uid_in:\n",
    "    train_uid_set = pickle.load(uid_in)\n",
    "with open('./ctr_cvr_data/train_md5_set.p', mode='rb') as md5_in:\n",
    "    train_md5_set = pickle.load(md5_in)\n",
    "print(len(train_uid_set))\n",
    "print(len(train_md5_set))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Test Set Common Feature 2.5% 采样"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#-*- coding:utf-8 -*-\n",
    "# Copy the test common-feature rows whose user id (feature field '101') is in the\n",
    "# sample to a smaller CSV, and record the sampled uids and MD5 keys.\n",
    "test_uid_set = set()\n",
    "test_md5_set = set()\n",
    "index = 0\n",
    "# `with` closes both files even if a malformed row raises part-way through.\n",
    "with open(\"xxxxx/sample_test/common_features_test.csv\",'r') as f:\n",
    "    with open(\"./ctr_cvr_data/common_features_skeleton_test_sample_2_percent.csv\",'w') as f_o:\n",
    "        for line in f:\n",
    "            tokens = line.strip().split(\",\")\n",
    "            md5 = tokens[0]\n",
    "            # tokens[2] is a \\001-separated list of entries shaped\n",
    "            # field_id \\002 feature_id \\003 value.\n",
    "            for fea_kv in tokens[2].split('\\001'):\n",
    "                fea_parts = fea_kv.split('\\002')\n",
    "                if fea_parts[0] == '101':\n",
    "                    fea_id = fea_parts[1].split('\\003')[0]\n",
    "                    # Sum the character codes of the uid to get a deterministic number.\n",
    "                    value = sum(ord(a) for a in fea_id)\n",
    "                    # Keeping sums divisible by 40 is a nominal 1-in-40 (2.5%) sample.\n",
    "                    if value % 40 == 0:\n",
    "                        f_o.write(line)\n",
    "                        test_uid_set.add(fea_id)\n",
    "                        test_md5_set.add(md5)\n",
    "                    # Only the first '101' entry of a row is used.\n",
    "                    break\n",
    "            index += 1\n",
    "            if index % 10000 == 0:\n",
    "                print(\"current_index:\",index)\n",
    "\n",
    "# Close the pickle files deterministically so the data is flushed to disk.\n",
    "with open('./ctr_cvr_data/test_uid_set.p', 'wb') as uid_out:\n",
    "    pickle.dump(test_uid_set, uid_out)\n",
    "with open('./ctr_cvr_data/test_md5_set.p', 'wb') as md5_out:\n",
    "    pickle.dump(test_md5_set, md5_out)\n",
    "print(0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Reload the sampled test uid / MD5 sets; `with` closes the file handles promptly.\n",
    "# NOTE: pickle.load can execute arbitrary code; only load files this notebook produced.\n",
    "with open('./ctr_cvr_data/test_uid_set.p', mode='rb') as uid_in:\n",
    "    test_uid_set = pickle.load(uid_in)\n",
    "with open('./ctr_cvr_data/test_md5_set.p', mode='rb') as md5_in:\n",
    "    test_md5_set = pickle.load(md5_in)\n",
    "print(len(test_uid_set))\n",
    "print(len(test_md5_set))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Overlap between the train and test samples, for uids and for MD5 keys.\n",
    "uid_common = train_uid_set.intersection(test_uid_set)\n",
    "md5_common = train_md5_set.intersection(test_md5_set)\n",
    "# Print the size of each overlap, uids first.\n",
    "for shared in (uid_common, md5_common):\n",
    "    print(len(shared))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 样本骨架处理（抽取上面有效md5对应的样本）\n",
    "### 训练骨架"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#-*- coding:utf-8 -*-\n",
    "# Keep only the training skeleton rows whose fourth column (tokens[3]) is one of the\n",
    "# MD5 keys in train_md5_set, which an earlier cell builds or reloads.\n",
    "index = 0\n",
    "# `with` closes both files even if a short or malformed row raises part-way through.\n",
    "with open(\"xxxxx/ESMM开源/sample_train/sample_skeleton_train.csv\",'r') as f:\n",
    "    with open(\"./ctr_cvr_data/sample_skeleton_train_sample_2_percent.csv\",'w') as f_o:\n",
    "        for line in f:\n",
    "            tokens = line.strip().split(\",\")\n",
    "            if tokens[3] in train_md5_set:\n",
    "                f_o.write(line)\n",
    "            index += 1\n",
    "            # Progress marker every million rows.\n",
    "            if index % 1000000 == 0:\n",
    "                print(\"current_index:\",index)\n",
    "print(0)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 测试骨架"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#-*- coding:utf-8 -*-\n",
    "# Keep only the test skeleton rows whose fourth column (tokens[3]) is one of the\n",
    "# MD5 keys in test_md5_set, which an earlier cell builds or reloads.\n",
    "index = 0\n",
    "# `with` closes both files even if a short or malformed row raises part-way through.\n",
    "with open(\"xxxx/ESMM开源/sample_test/sample_skeleton_test.csv\",'r') as f:\n",
    "    with open(\"./ctr_cvr_data/sample_skeleton_test_sample_2_percent.csv\",'w') as f_o:\n",
    "        for line in f:\n",
    "            tokens = line.strip().split(\",\")\n",
    "            if tokens[3] in test_md5_set:\n",
    "                f_o.write(line)\n",
    "            index += 1\n",
    "            # Progress marker every million rows.\n",
    "            if index % 1000000 == 0:\n",
    "                print(\"current_index:\",index)\n",
    "print(0)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
